rm(list=ls(all=TRUE))
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
Read combined metabolomics dataset:
data_wide <- read_csv("Data/data_final.csv")
## Rows: 44570 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Feature_ID, Dataset
## dbl (78): Row_ID, Row_MZ, Row_RT, Correlation_Group_ID, EMR_04_10_MD, EMR_04...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Make separate table for feature metadata:
feature_info <- data_wide %>%
select("Feature_ID", "Dataset", "Row_ID", "Row_MZ", "Row_RT", "Correlation_Group_ID")
feature_info
Transform into long format:
data_long <- data_wide %>%
select(Feature_ID, starts_with("EMR_04")) %>%
pivot_longer(cols = c(starts_with("EMR_04")), names_to = "Sample_ID", values_to = "Log10_Ratio")
rm(data_wide)
data_long
Read sample metadata:
sample_info <- read_tsv("Data/sample_metadata.tsv")
## Rows: 74 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (15): Sample_Identifier, SampleCollectionDateandTime, Metabolomics_Data,...
## dbl (3): AgeInYears, Sebumeter_Score, Skicon_Score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Select and rename colums:
sample_info <- sample_info %>%
select(Sample_ID = Sample_Identifier, Study_Group = ATTRIBUTE_Study_Group,
Age = AgeInYears, Sex = ATTRIBUTE_BiologicalSex, Ethnic_Group = ATTRIBUTE_Ethnic_Group,
Sebumeter_Score, Skicon_Score)
sample_info
Sample numbers:
sample_info %>%
count(Study_Group)
Check distribution of numerical parameters:
sample_info <- sample_info %>%
mutate(
Log10_Sebumeter_Score = log10(Sebumeter_Score),
Log10_Skicon_Score = log10(Skicon_Score)
)
sample_info %>%
pivot_longer(
cols = c(
Sebumeter_Score, Log10_Sebumeter_Score,
Skicon_Score, Log10_Skicon_Score,
Age
),
names_to = "Parameter", values_to = "Value"
) %>%
filter(!is.na(Value)) %>%
mutate(
Parameter = factor(
Parameter,
levels = c(
"Sebumeter_Score", "Log10_Sebumeter_Score",
"Skicon_Score", "Log10_Skicon_Score",
"Age"
)
)
) %>%
ggplot() +
geom_histogram(aes(Value), bins = 10) +
facet_wrap("Parameter", ncol = 2, scales = "free")
Nested tibble:
data_nested <- data_long %>%
inner_join(
sample_info,
by = "Sample_ID"
) %>%
group_by(Feature_ID) %>%
nest() %>%
ungroup()
data_nested$data[[1]]
Study_Group is the variable of interest, Age, Sex, Ethnic_Group, and Log10_Skicon_Score are confounding variables.
Fit linear models:
data_nested <- data_nested %>%
mutate(
Models = data %>% map(
~ lm(Log10_Ratio ~ Study_Group + Age + Sex + Ethnic_Group + Log10_Skicon_Score,
data = ., na.action = na.omit) %>% try()
),
Classes = Models %>% map_chr(~ paste(class(.), collapse = " "))
)
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
## contrasts can be applied only to factors with 2 or more levels
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
## contrasts can be applied only to factors with 2 or more levels
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
## contrasts can be applied only to factors with 2 or more levels
data_nested %>% count(Classes)
Extract model coefficients:
stats_study_group <- data_nested %>%
filter(Classes == "lm") %>%
mutate(
Coefficients = Models %>% map(tidy)
) %>%
select(Feature_ID, Coefficients) %>%
unnest(Coefficients) %>%
filter(term == "Study_GroupOily Scalp")
stats_study_group
Sebumeter_Score + Log10_Skicon_Score are the variables of interest, Age, Sex, and Ethnic_Group are confounding variables.
Fit linear models:
data_nested <- data_nested %>%
mutate(
Models = data %>% map(
~ lm(Log10_Ratio ~ Age + Sex + Ethnic_Group + Sebumeter_Score + Log10_Skicon_Score,
data = ., na.action = na.omit) %>% try()
),
Classes = Models %>% map_chr(~ paste(class(.), collapse = " "))
)
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
## contrasts can be applied only to factors with 2 or more levels
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
## contrasts can be applied only to factors with 2 or more levels
data_nested %>% count(Classes)
Extract model coefficients:
stats_physical_skin_parameters <- data_nested %>%
filter(Classes == "lm") %>%
mutate(
Coefficients = Models %>% map(tidy)
) %>%
select(Feature_ID, Coefficients) %>%
unnest(Coefficients) %>%
filter(term == "Sebumeter_Score" | term == "Log10_Skicon_Score")
stats_physical_skin_parameters
Combine stats results:
feature_stats <- bind_rows(stats_study_group, stats_physical_skin_parameters)
rm(data_nested, stats_study_group, stats_physical_skin_parameters)
feature_stats
Add contrast labels and format stats results for Cytoscape:
feature_stats <- feature_stats %>%
mutate(
p.volcano = -log10(p.value),
p.volc.dir = p.volcano * sign(statistic),
Contrast_Label = term %>% case_match(
"Study_GroupOily Scalp" ~ "groups|oily-norm",
"Sebumeter_Score" ~ "base|sebum",
"Log10_Skicon_Score" ~ "base|skicon"
),
p_value = p.value %>% round(digits = 3) %>% as.character(),
p_volcano = p.volcano %>% round(digits = 3) %>% as.character(),
p_volc_dir = p.volc.dir %>% round(digits = 3) %>% as.character(),
p_cat = case_when(
p.value < 0.001 ~ "< 0.001",
p.value < 0.01 ~ "< 0.01",
p.value < 0.1 ~ "< 0.1"
),
p_cat_dir = case_when(
is.na(p_cat) ~ NA_character_,
statistic > 0 ~ paste0("Up, ", p_cat),
statistic < 0 ~ paste0("Dn, ", p_cat)
)
)
feature_stats
Combine stats results with feature info and write to file:
feature_stats_info <- feature_info %>%
select(Feature_ID, Dataset, Row_ID) %>%
inner_join(
feature_stats %>% select(Feature_ID, Contrast_Label, starts_with("p_")),
by = "Feature_ID"
)
feature_stats_info %>% write_csv("Analysis/feature_stats.csv")
feature_stats_info
Write results for to file for loading into Cytoscape (individual formats, individual datasets):
feature_stats_info %>%
pivot_longer(cols = starts_with("p_"), names_to = "Format", values_to = "Value", values_drop_na = T) %>%
pivot_wider(names_from = Contrast_Label, values_from = Value) %>%
select(Dataset, Format, Row_ID, contains("|")) %>%
group_by(Dataset, Format) %>%
nest() %>%
mutate(
Filename = str_c(Dataset, Format, "tsv", sep = "."),
data = map2(data, Format, \(data, Format) data %>% rename_with(\(x) str_c(Format, x, sep = "|"), contains("|"))),
data = map2(data, Filename, \(data, Filename) data %>% write_tsv(str_c("Analysis/", Filename), na = ""))
)
dir("Analysis")
## [1] "C18_neg.p_cat_dir.tsv" "C18_neg.p_cat.tsv"
## [3] "C18_neg.p_value.tsv" "C18_neg.p_volc_dir.tsv"
## [5] "C18_neg.p_volcano.tsv" "C18_pos.p_cat_dir.tsv"
## [7] "C18_pos.p_cat.tsv" "C18_pos.p_value.tsv"
## [9] "C18_pos.p_volc_dir.tsv" "C18_pos.p_volcano.tsv"
## [11] "feature_stats.csv" "HILIC_neg.p_cat_dir.tsv"
## [13] "HILIC_neg.p_cat.tsv" "HILIC_neg.p_value.tsv"
## [15] "HILIC_neg.p_volc_dir.tsv" "HILIC_neg.p_volcano.tsv"
## [17] "HILIC_pos.p_cat_dir.tsv" "HILIC_pos.p_cat.tsv"
## [19] "HILIC_pos.p_value.tsv" "HILIC_pos.p_volc_dir.tsv"
## [21] "HILIC_pos.p_volcano.tsv"